{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "76c38b11",
   "metadata": {},
   "source": [
    "# The most important idea behind training the model\n",
    "\n",
    "`curriculum learning` is the main idea behind train the model. because we use character level tokenization for input (transformer encoding layer) and sub-word level tokenization for output (transformer decoder layer) it's important to learn alignment of input-output token pair first. unless input output token alignment the model cannot produce correct output from given input even the correct input were given."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b7370bc7",
   "metadata": {},
   "source": [
    "### The first step: Generate individual words from all the text dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "5e1abbdc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "files: 59000.json, 31000.json, 69000.json, 34000.json, 101000.json, 143000.json, 122000.json, 108000.json, 14000.json, 43000.json, 90000.json, 107000.json, 56000.json, 49000.json, 55000.json, 20000.json, 28000.json, 106000.json, 4000.json, 70000.json, 62000.json, 39000.json, 33000.json, 103000.json, 61000.json, 11000.json, 137000.json, 87000.json, 17000.json, 127000.json, 124000.json, 65000.json, 48000.json, 50000.json, 94000.json, 52000.json, 138000.json, 66000.json, 7000.json, 134000.json, 123000.json, 8000.json, 24000.json, 116000.json, 19000.json, 125000.json, 131000.json, 79000.json, 104000.json, 118000.json, 84000.json, 60000.json, 53000.json, 5000.json, 76000.json, 73000.json, 1000.json, 35000.json, 6000.json, 93000.json, 92000.json, 67000.json, 88000.json, 105000.json, 16000.json, 36000.json, 21000.json, 78000.json, 18000.json, 64000.json, 100000.json, 97000.json, 140000.json, 10000.json, 135000.json, 12000.json, 109000.json, 41000.json, 130000.json, 42000.json, dataset.json, 38000.json, 71000.json, 23000.json, 139000.json, 83000.json, 75000.json, 119000.json, 37000.json, 45000.json, 2000.json, 110000.json, 27000.json, 117000.json, 29000.json, 115000.json, 13000.json, 54000.json, 129000.json, 120000.json, 91000.json, 126000.json, 47000.json, 9000.json, 82000.json, 63000.json, 22000.json, 3000.json, 132000.json, 26000.json, 85000.json, 133000.json, 111000.json, 141000.json, 142000.json, 68000.json, 72000.json, 0.json, 15000.json, 95000.json, 121000.json, 58000.json, 30000.json, 136000.json, 77000.json, 89000.json, 25000.json, 96000.json, 102000.json, 44000.json, 98000.json, 99000.json, 143747.json, 113000.json, 128000.json, 74000.json, 51000.json, 57000.json, 32000.json, 112000.json, 40000.json, 81000.json, 114000.json, 46000.json, 86000.json, 80000.json\n",
      "count: 146\n"
     ]
    }
   ],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "data_dir = Path.cwd().parent.parent / \".data\" / \"text\"\n",
    "files_to_be_included = [f for f in data_dir.iterdir() if f.is_file() and f.name.endswith(\".json\")]\n",
    "# files_to_be_included = [data_dir / f\"{i * 1000}.json\" for i in range(25)]\n",
    "print(f\"files: {\", \".join([i.name for i in files_to_be_included])}\")\n",
    "print(f\"count: {len(files_to_be_included)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8cd59c15",
   "metadata": {},
   "source": [
    "Read all the file contents into a variable."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "d93ef296",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 146/146 [00:02<00:00, 71.29it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "All the sentence count: 2866036\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "import json\n",
    "\n",
    "sentences: list[str] = []\n",
    "\n",
    "for i in tqdm(files_to_be_included, total=len(files_to_be_included)):\n",
    "    with open(i, \"r\") as f:\n",
    "        sentences_in_file: list[str] = json.load(f)\n",
    "        expanded_sentences: list[str] = []\n",
    "        for s in sentences_in_file:\n",
    "            ss = s.split(\".\")\n",
    "            for s in ss:\n",
    "                s = s.strip()\n",
    "                if s == \"\":\n",
    "                    continue\n",
    "                expanded_sentences.append(s)\n",
    "    sentences.extend(expanded_sentences)\n",
    "\n",
    "print(f\"All the sentence count: {len(sentences)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "afbd1fc7",
   "metadata": {},
   "source": [
    "split them with whitespace and -, : symbols"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "id": "862a2279",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 2866036/2866036 [00:21<00:00, 130785.89it/s]\n",
      "100%|██████████| 2866036/2866036 [00:00<00:00, 3642803.67it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Extracted 53308345 words.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "\n",
    "def split_and_clean_sentences(sentences: list[str]) -> list[str]:\n",
    "    \"\"\"\n",
    "    Splits each sentence in the input list by common symbols. like whitespace, -, :, =, ،, ., !, ?, ), (, », «, ؛\n",
    "    Returns a flat list of cleaned sentence fragments (non-empty).\n",
    "    \"\"\"\n",
    "    common_symbols = r\"[.\\s:\\-=،!?؟()»«؛،.›‹]+|[\\d_]\"\n",
    "    lists = [re.split(common_symbols, sentence) for sentence in tqdm(sentences)]\n",
    "    return [item for sublist in tqdm(lists) for item in sublist]\n",
    "words = split_and_clean_sentences(sentences)\n",
    "print(f\"Extracted {len(words)} words.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "id": "f8c58ae9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "53308345\n",
      "Unique word count: 715371\n"
     ]
    }
   ],
   "source": [
    "def deduplicate_words(words: list[str]) -> list[str]:\n",
    "    \"\"\"\n",
    "    Removes duplicate words while preserving order of first appearance.\n",
    "    \"\"\"\n",
    "    print(len(words))\n",
    "    return list(set(words))\n",
    "\n",
    "unique_words = deduplicate_words(words)\n",
    "print(f\"Unique word count: {len(unique_words)}\")\n",
    "\n",
    "with open(data_dir.parent / \"individual_words.json\", \"w\") as f:\n",
    "    json.dump(unique_words, f, ensure_ascii=False, indent=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "id": "bc88ad80",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Unique word count after cleaning: 715370\n",
      "686103\n",
      "Unique Uyghur word count after removing English characters: 678761\n"
     ]
    }
   ],
   "source": [
    "# The unique words contain meaningless characters like \".\", \"!\", \"?\", and English words. \n",
    "# Since this is an Uyghur language dataset of individual words, we need to remove them to make the dataset clean.\n",
    "\n",
    "# First, clean common symbols like \".\", \"!\", \"?\", \")\", \"(\" and numbers.\n",
    "\n",
    "import re\n",
    "\n",
    "def clean_common_symbols(words: list[str]) -> list[str]:\n",
    "    \"\"\"\n",
    "    Cleans common symbols from the input list of words.\n",
    "    \n",
    "    Removes punctuation marks, numbers, and other non-alphabetic characters\n",
    "    from each word in the list. Returns a list of cleaned words.\n",
    "    \"\"\"\n",
    "\n",
    "    common_symbols = r\"[.\\s:\\-=،!?؟()»«؛،.›‹]+|[\\d_]\"\n",
    "\n",
    "    cleaned_words = []\n",
    "    for word in words:\n",
    "        # Remove common symbols, numbers, and punctuation\n",
    "        cleaned = re.sub(common_symbols, '', word)\n",
    "        if cleaned:  # Only add non-empty words\n",
    "            cleaned_words.append(cleaned)\n",
    "    return cleaned_words\n",
    "\n",
    "# Remove empty words\n",
    "unique_words = [word for word in unique_words if word]\n",
    "print(f\"Unique word count after cleaning: {len(unique_words)}\")\n",
    "\n",
    "# Remove English characters from words\n",
    "\n",
    "def clean_english_characters(word: str) -> str:\n",
    "    \"\"\"\n",
    "    Removes all English alphabet characters from a word.\n",
    "    \n",
    "    Args:\n",
    "        word: The word to clean\n",
    "        \n",
    "    Returns:\n",
    "        The word with all English characters removed\n",
    "    \"\"\"\n",
    "    # Remove all English alphabet characters (both uppercase and lowercase)\n",
    "    return re.sub(r'[a-zA-Z]', '', word)\n",
    "\n",
    "def clean_non_uyghur_words(words: list[str]) -> list[str]:\n",
    "    \"\"\"\n",
    "    Cleans the list of words by removing English characters.\n",
    "    \n",
    "    Args:\n",
    "        words: List of words to clean\n",
    "        \n",
    "    Returns:\n",
    "        List of cleaned words with English characters removed\n",
    "    \"\"\"\n",
    "    # First clean common symbols\n",
    "    cleaned_words = clean_common_symbols(words)\n",
    "    \n",
    "    # Then remove English characters from each word\n",
    "    filtered_words = []\n",
    "    for word in cleaned_words:\n",
    "        cleaned_word = clean_english_characters(word)\n",
    "        if cleaned_word:  # Only keep non-empty words\n",
    "            filtered_words.append(cleaned_word)\n",
    "    \n",
    "    return filtered_words\n",
    "\n",
    "# Clean the unique words by removing symbols and English characters\n",
    "cleaned_unique_words = clean_non_uyghur_words(unique_words)\n",
    "cleaned_unique_words = deduplicate_words(cleaned_unique_words)\n",
    "print(f\"Unique Uyghur word count after removing English characters: {len(cleaned_unique_words)}\")\n",
    "\n",
    "# Save the cleaned words to a new file\n",
    "with open(data_dir.parent / \"individual_words.json\", \"w\") as f:\n",
    "    json.dump(cleaned_unique_words, f, ensure_ascii=False, indent=2)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "id": "1e3a5c16",
   "metadata": {},
   "outputs": [],
   "source": [
    "#sort the words by length\n",
    "cleaned_unique_words.sort(key=len)\n",
    "\n",
    "cleaned_unique_words = cleaned_unique_words[10_000:500_000]\n",
    "\n",
    "#save the sorted words to a new file\n",
    "with open(data_dir.parent / \"individual_words.json\", \"w\") as f:\n",
    "    json.dump(cleaned_unique_words, f, ensure_ascii=False, indent=2)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "2b71c9fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "with open(\"/home/dream-lab/project/spell_correction/.data/words.json\", \"r\") as f:\n",
    "    lines = f.read()\n",
    "\n",
    "with open(\"/home/dream-lab/project/spell_correction/.data/words.json\", \"w\") as f:\n",
    "    json.dump(lines, f, ensure_ascii=False, indent=2)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "spell-correction",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
